由於這陣子在搞的東西跟CNV有非常大的關係,所以也陸續參考了不少文獻。前面我提過了絕大多數的演算法都是基於read depth
這個值衍生出來的,而我那時也寫了一個閹割版的BEDtools makewindows
的功能,今天我抽了點時間把前面那一步怎麼產生genomeInfo file的功能也寫好,搭配後面計算每個windows中reads數目的小工具來做一個介紹。
BioSequences.jl
計算每個chromosome的長度using BioSequences.FASTA
using DataFrames
using CSV
function getChromLenFromRef(fasta::BioSequences.FASTA.Reader, output::String)
IDs = Array{String}(undef, 0)
Length = Array{Int}(undef, 0)
for record in fasta
id = identifier(record)
seq = FASTA.sequence(record)
seqlen = length(seq)
push!(IDs, id)
push!(Length, seqlen)
end
outputDF = DataFrame(:Chromosome => IDs, :Length => Length)
CSV.write(output, outputDF, delim = "\t")
end
genomefile = "human_GRCh38_p12.fa" ## 這是從NCBI上面下載來的人類參考基因體序列
fastaInfo = FASTA.Reader(open(genomefile, "r"))
genomeinfofile = "genomeInfoFile.txt"
getChromeLenFromRef(fastaInfo, genomeinfofile)
run(`sed -i '1d' $genomeinfofile`)
makeWindowsForBED.jl
function makeWindows(df::DataFrame, winSize::Int, windowsBED::String)
Chr = Array{String}(undef, 1)
Start = Array{Int}(undef, 1)
Stop = Array{Int}(undef, 1)
for c = 1:nrow(df)
chr = df.chr[c]
chrLen = df.chrlen[c]
winNum = Int(floor(chrLen/winSize))
restLen = chrLen % winSize
for i = 0:winNum
startpos = i * winSize
if i != winNum
endpos = startpos + winSize
else
endpos = startpos + restLen
end
push!(Chr, chr)
push!(Start, startpos)
push!(Stop, endpos)
end
end
winDF = DataFrame(:chr => Chr, :start => Start, :stop => Stop)
CSV.write(windowsBED, winDF, delim = "\t")
end
windowSize = 20000
windowsBED = "windows.bed"
df = CSV.read(genomeinfofile, header = ["chr", "chrlen"], delim = "\t")
makeWindows(df, windowSize, windowsBED)
run(`sed -i '1d' `)
countFile = "sample.count"
run(pipeline(`bam2bed < input.bam`, `awk '$5 > 35 {print $0}'`, stdout=pipeline(`bedmap --count $windowsBED -`, "$countFile")))
接下來就剩下一堆統計分析了